
###########################################################
##### Haiti elite network project  		          			#####
##### buid price dataset  	                      		#####
##### 2021 mar 03                   									#####
###########################################################


#####
## calculate number of firms and families in biz ownership
#####

# business ownership data
dat <- read.csv("01_Data/02_Clean/top_0911_cpi_own.csv")

dat <- data.table(dat)
dat <- dat[, num_firms := length(unique(con_final)), by = product]
dat <- dat[, num_fams := length(unique(fam)), by = product]
dat <- as.data.frame(dat)


#####
## prep consumer price data - fix dates and make long
#####

## standardized

pri <- read.csv('01_Data/01_Raw/01_Prices/ihsi cpi 01_13 standard.csv', skip = 2)

colnames(pri)[5:149] <- gsub("X", "", colnames(pri)[5:149])
colnames(pri)[5:149] <- paste0("price_",colnames(pri)[5:149])

pri2 <- reshape(pri, dir = "long", varying = c(colnames(pri[,5:149])),
                timevar = "month", sep = "_")
pri2$month <- ifelse(nchar(pri2$month)==5, paste0("0",pri2$month), pri2$month)
pri2$year <- paste0(20,strtrim(pri2$month, 2))
pri2$month2 <- sapply(strsplit(pri2$month, "\\."),"[[",2)
pri2$month2 <- car::recode(pri2$month2, "'Fev'='Feb'")
pri2$month <- car::recode(pri2$month2,"'Jan'=1; 'Feb'=2; 'Mar'=3; 'Apr'=4; 'May'=5; 'Jun'=6;
                       'Jul'=7; 'Aug'=8; 'Sep'=9; 'Oct'=10; 'Nov'=11; 'Dec'=12")
pri2 <- merge(pri2, reg, by = c('year','month'), all.x=T)
pri2$date <- paste0("1", tolower(pri2$month2), pri2$year)
pri2$date2 <- as.Date(pri2$date, format = "%d%b%Y")


## unstandardized

t <- read.csv('01_Data/01_Raw/01_Prices/ihsi cpi 01_13.csv', skip = 2)

colnames(t)[5:149] <- gsub("X", "", colnames(t)[5:149])
colnames(t)[5:149] <- paste0("price_",colnames(t)[5:149])

t2 <- reshape(t, dir = "long", varying = c(colnames(t[,5:149])),
              timevar = "month", sep = "_")
t2$month <- ifelse(nchar(t2$month)==5, paste0("0",t2$month), t2$month)
t2$year <- paste0(20,strtrim(t2$month, 2))
t2$month2 <- sapply(strsplit(t2$month, "\\."),"[[",2)
t2$month2 <- car::recode(t2$month2, "'Fev'='Feb'")
t2$month <- car::recode(t2$month2,"'Jan'=1; 'Feb'=2; 'Mar'=3; 'Apr'=4; 'May'=5; 'Jun'=6;
                       'Jul'=7; 'Aug'=8; 'Sep'=9; 'Oct'=10; 'Nov'=11; 'Dec'=12")
t2 <- merge(t2, reg, by = c('year','month'), all.x=T)
t2$date <- paste0("1", tolower(t2$month2), t2$year)
t2$date2 <- as.Date(t2$date, format = "%d%b%Y")
t2$price <- as.numeric(t2$price)

write.csv(t2, '01_Data/02_Clean/ihsi_0113_clean.csv')


## make list of cpi products

t3 <- unique(subset(t2, select = c(product, hscodes)))

## take out products that aren't imported or don't have data
t3 <- subset(t3, t3$product!="Approvisionnement en eau" & t3$product!="Charbon de bois")
t3 <- subset(t3, t3$product!="Robe" & t3$product!="Pantalon pour homme")
t3 <- smartbind(t3, data.frame('product'='Tissus', 'hscodes'='5208'))

t3 <- t3 %>%
  separate(hscodes, c('hscode_1', 'hscode_2', 'hscode_3'))
t4 <- reshape(t3, direction = 'long', sep = "_", varying = 2:4)
t4 <- subset(t4, is.na(t4$hscode)==F & t4$hscode!="", select = c('product','hscode'))
write.csv(t4, '01_Data/02_Clean/cpiprod.csv')


#####
## prepare supply prices for CPI products from usatrade data
#####

## supply prices
us <- read.csv("01_Data/02_Clean/prices_all_month.csv")
us <- subset(us, select = c(hscode_us:price_dr))

## hs-cpi product crosswalk
cpilist <- read.csv('01_Data/02_Clean/cpiprod.csv')
cpilist$X=NULL


## merge cpi prod names into us price data and drop non-CPI prods
cpilist$hs_4 <- sprintf("%04s", cpilist$hscode)
us$hs_four <- sprintf("%04d", us$hs_four)
setnames(us, c('product', 'hs_four'), c('product_us', 'hs_4'))
us2 <- merge(us, cpilist, by = "hs_4", all.x = F)


## take average price and sum value + qty from each month by product
us2 <- data.table(us2)
us2 <- us2[, list('qty_ha' = sum(qty_ha, na.rm=T), 'value_ha' = sum(value_ha, na.rm=T),
                  'price_ha' = mean(price_ha, na.rm=T), 'price_wo' = mean(price_wo, na.rm=T),
                  'price_dr' = mean(price_dr, na.rm=T)),
           by = c('product', 'year', 'month')]


## create reference price of Aug 2004 for supply prices
temp <- subset(us2, us2$year==2004 & us2$month==8, select = -c(year, month, qty_ha, value_ha))
setnames(temp, colnames(temp)[2:4], paste0(colnames(temp)[2:4], "_aug04"))
us2 <- merge(us2, temp, by = c('product'))

us2$price_sup_dr <- us2$price_dr/us2$price_dr_aug04 * 100
us2$price_sup_ha <- us2$price_ha/us2$price_ha_aug04 * 100
us2$price_sup_wo <- us2$price_wo/us2$price_wo_aug04 * 100


## log price
us2$price_sup_dr_log <- log(us2$price_sup_dr)
us2$price_sup_ha_log <- log(us2$price_sup_ha)
us2$price_sup_wo_log <- log(us2$price_sup_wo)


#####
## calculate ownership
#####

## split up family lists in business ownership data
temp = strsplit(dat$fam, "; ", fixed = T)
ncol = max(sapply(temp,length))
temp <- as.data.table(lapply(1:ncol, FUN = function(x) sapply(temp,"[",x)))
setnames(temp, paste0("V",seq(1:ncol)), paste0('fam_',seq(1,ncol)))

dat <- cbind.data.frame(dat, temp)
dat <- subset(dat, select = -c(X, X.1, fam))
ncol = ncol(temp)


## calculate number of owners
dat$nown <- ifelse(is.na(dat[,'fam_1'])==T, 0, 1)
for(i in 1:(ncol-1)){
  dat$nown <- ifelse(is.na(dat[,(which(colnames(dat)=="fam_1")+i)])==F, i, dat$nown)
}


## make long
dat2 <- reshape(dat, dir = "long", varying = paste0('fam_',seq(1:(ncol))), sep = "_")


## make new weight that takes owners into account
setnames(dat2, "pct_0911_dispo", "pct_0911_dispo_old")
dat2$pct_0911_dispo <- dat2$wgt_kg / dat2$nown
dat2$pct_0911_dispo <- ifelse(dat2$pct_0911_dispo==Inf, NA, dat2$pct_0911_dispo)
dat2$qty_fam <- dat2$qty / dat2$nown
dat2$qty_fam <- ifelse(dat2$qty_fam==Inf, NA, dat2$qty_fam)


## merge price into wgt by family
us3 <- data.table(subset(us2, select = c(product, price_sup_ha)))
us3 <- us3[, lapply(.SD, median, na.rm=T), by = list(product)]
dat2 <- merge(dat2, us3, by = 'product', all.x = T)


## calculate value by fam
dat2$value_qty_fam <- dat2$qty_fam * dat2$price_sup_ha



#####
## aggregate family data to CPI product
#####

att <- read.csv("01_Data/02_Clean/family_attributes.csv")
cent <- read.csv('01_Data/02_Clean/centrality.csv')
cent$X=NULL; att$X=NULL

fam <- merge(cent, att, by='family', all=T)
fam <- subset(fam, fam$family!="",
              select = c(family, nclust, reachability, nind,
                         coup,
                         bonw_02_wnind))

## fix fam names that have typos for better merge with other data
fam$family <- car::recode(fam$family, '"DELAFUENTE"="DE LA FUENTE"; "DADESKY"="D\'ADESKY";
                       "SAINT FORT"="ST-FORT"; "ST-FELIX"="SAINT FELIX"; "LE CORPS"="LECORPS";
                       "ST CLAIRE"="SAINT CLAIR"; "O\'BRIEN"="OBRIEN"; "LAFERIERE"="LAFERRIERE";
                       "MORISSET"="MORISET"; "MONDERSIR"="MONDESIR"; "JEANOT"="JEANNOT";
                       "MARA"="MARRA"; "LOUIS JEUNE"="LOUIS-JEUNE";
                       "PAUTYNSKY"="PAUTENSKI"; "SAINT FELIX"="SAINT-FELIX"; "FILSAIME"="FILS-AIME";
                       "BEHRMANN"="BERHMANN"; "GUILLOT"="GUILLIOT"; "ROSSEAU"="ROUSSEAU"; "MONBRUN"="MOMBRUN";
                       "JEAN BARD"="JEAN-BART";
                       "COQUILLON"="COQUILLEAU"; "DESTIN"="DESTINE"; "CROICY"="CROISSY";
                       "MILFORT"="MILFORD";
                       "EDMON"="EDMOND"; "DOCE"="DOLCE";
                       "CHAMPON"="CHAMPION"; "BOURGUIGNON"="BOURGUIGNOL"; "BOUGUIGNON"="BOURGUIGNOL";
                       "BIJOUX"="BIJOU"; "HILLEL"="HILEILE"; "AUXILA"="AUXILLA"')


## merge immigration histories

book <- read.csv('01_Data/02_Clean/all_immig.csv')
book$X=NULL
book$immig_noneur <- book$syrian
fam <- merge(fam, book, by.x = 'family', by.y = 'fam', all.x =T)


## merge coup list

coup <- read.csv("01_Data/02_Clean/coup_fams.csv", as.is = T)
coup$X=NULL
fam$coup <- ifelse(fam$family %in% coup$last_name, 1, 0)


## make rank variables
fam <- fam[order(fam$bonw_02_wnind, decreasing = T),]
fam$bonw_02_wnind_st <- standardize(fam$bonw_02_wnind)
fam$bonw_02_wnind_rank <- ifelse(is.na(fam$bonw_02_wnind)==T, NA, rank(fam$bonw_02_wnind))


## merge family data into ownership matrix
dat2 <- merge(dat2, fam, by.x = "fam", by.y = "family", all.x = T)



#####
## aggregate family variables by product weighted by market share
#####

dat3 <- data.table(subset(dat2, select = c(product, pct_0911_dispo,
                                           nclust, reachability, nind, coup,
                                           bonw_02_wnind_st,
                                           num_firms, num_fams,
                                           immig, immig_noneur),
                          is.na(dat2$fam)==F))
dat4 <- dat3[,lapply(.SD, weighted.mean, w = pct_0911_dispo, na.rm = T),
             by = list(product)]


## calculate value imported by coup participants
t <- data.table(subset(dat2, select = c(product, coup, value_qty_fam)))
t <- t[, list(value_qty_coup = sum(value_qty_fam[coup == 1], na.rm=T),
              value_qty_nocoup = sum(value_qty_fam[coup == 0], na.rm=T)),
       by = product]
dat4 <- merge(dat4, t, by = 'product')



#####
## merge product-level info into time series price data
#####

## merge in prices
pri3 <- merge(pri2, dat4, by = 'product')


## create regime vars
pri3$regime <- as.factor(pri3$regime)
pri3$product <- as.factor(pri3$product)
pri3$date3 <- as.factor(pri3$date2)


## autoc and quake dummies (year and month)
pri3$autoc <- car::recode(pri3$regime, 'c("d1", "d2", "d3")=0; else=1')
pri3$quake <- car::recode(pri3$year, '2010=1; else=0')
for (i in 1:12){
  pri3$temp <- ifelse(pri3$year=='2010' & pri3$month==i, 1, 0)
  setnames(pri3, 'temp', paste0('quake',i))
}


## log prices
pri3$price_log <- log(pri3$price)


## lagged prices
pri3 <- pri3[order(pri3$product, pri3$date3),]
pri3$price_log_lag1 <- lag_group(var=pri3$price_log, index=pri3$product, lag=-1)
pri3$price_log_lag2 <- lag_group(var=pri3$price_log, index=pri3$product, lag=-2)
pri3$price_log_lag3 <- lag_group(var=pri3$price_log, index=pri3$product, lag=-3)
pri3$price_log_lag4 <- lag_group(var=pri3$price_log, index=pri3$product, lag=-4)


#####
## merge in product characteristics
#####


## merge CPI product desc into prod characteristics data

cpilist <- read.csv('01_Data/02_Clean/cpiprod.csv')
cpilist$X=NULL
cpilist$hscode <- sprintf("%04d", cpilist$hscode)


## read in product chars
prod <- read.dta('01_Data/02_Clean/product_chars.dta')

prod$hs_4 <- sprintf("%04d", as.numeric(prod$hs_4))
prod <- merge(prod, cpilist, by.x = "hs_4", by.y = "hscode", all.x = T)


## collapse product chars into cpi product
prod <- data.table(prod)
prod2 <- prod[,lapply(.SD, FUN = function(x) mean(x, na.rm = T)), by = "product",
              .SDcols = c('bulk_ln', 'divis_ln', 'ref_lib', 'eos', 'time_con',
                          'time_bin', 'cshare', 'pci_value')]
prod2 <- data.frame(prod2)


## replace NaNs
cols <- which(colnames(prod2)!='product')
for (i in cols){
  prod2[,i] <- ifelse(prod2[,i]=="NaN", NA, prod2[,i])
}


## make dummy for consumer / input if share of consumer expenditures is zero
prod2$input <- ifelse(is.na(prod2$cshare)==T, 1, 0)


## merge into CPI price data by product
pri3 <- merge(pri3, prod2, by = "product", all.x = T)


## merge in supply prices
pri3 <- merge(pri3, us2, by = c('product', 'year', 'month'), all.x = T)


#####
## merge in gdelt data
#####

gd <- read.csv('01_Data/01_Raw/gdelt_forhaiti_monthly_conflict.csv')

## prep gdelt
gd$year <- as.numeric(strtrim(gd$month, 4))
gd$month <- as.numeric(substr(gd$month, 5, nchar(gd$month)))

## merge into pri3 by month + year
pri3 <- merge(pri3, gd, by = c('year', 'month'), all.x = T)

## fix order
pri3 <- pri3[order(pri3$product, pri3$year, pri3$month),]

pri3$noneliteconflict <- pri3$materialconflict - pri3$haitieliteconflict



#####
## finish buildings vars
#####

## make lagged prices
tolag <- grep('price_sup_[a-z]{2}_log', colnames(pri3), value = T)
periods <- seq(1,8,1)
for (i in 1:length(tolag)){
  for (t in 1:length(periods)){
    pri3$temp <- lag_group(var=pri3[,tolag[i]], index=pri3$product, lag=-periods[t])
    setnames(pri3, 'temp', paste0(tolag[i], '_lag', periods[t]))
  }
}


## standardize product chars
pri3$divis_ln_st <- standardize(pri3$divis_ln)
pri3$bulk_ln_st <- standardize(pri3$bulk_ln)
pri3$ref_st <- standardize(pri3$ref_lib)
pri3$pci_value_st <- standardize(pri3$pci_value)
pri3$time_con_st <- standardize(pri3$time_con)
pri3$cshare_st <- standardize(pri3$cshare)


## interactions of product characteristics and autocracy
pri3$autoc <- as.numeric(pri3$autoc)
pri3$bulkXautoc <- pri3$bulk_ln_st*pri3$autoc
pri3$divisXautoc <- pri3$divis_ln_st*pri3$autoc
pri3$refXautoc <- pri3$ref_st*pri3$autoc
pri3$timeXautoc <- pri3$time_con_st*pri3$autoc
pri3$pciXautoc <- pri3$pci_value_st*pri3$autoc
pri3$cshareXautoc <- pri3$cshare_st*pri3$autoc


## log number of firms and consumption share
pri3$num_firms_log <- log(pri3$num_firms)
pri3$cshare_log <- log(pri3$cshare)


## make var for change in price
pri3 <- pri3[order(pri3$product, pri3$date2),]
pri3$price_log_lag1 <- lag_group(var=pri3$price_log, index=pri3$product, lag=-1)
pri3$price_delta_log <- pri3$price_log - pri3$price_log_lag1
pri3$price_sup_ha_delta_log <- pri3$price_sup_ha_log - pri3$price_sup_ha_log_lag1
pri3$price_sup_wo_delta_log <- pri3$price_sup_wo_log - pri3$price_sup_wo_log_lag1


## make var for haiti consumer over supply price
pri3$markup <- pri3$price / pri3$price_sup_ha
pri3$markup_log <- log(pri3$markup)
pri3$markup_log_lag1 <- lag_group(var=pri3$markup_log, index=pri3$product, lag=-1)
pri3$markup_log_delta <- pri3$markup_log - pri3$markup_log_lag1


## make var for haiti consumer over world supply price
pri3$markup2 <- pri3$price / pri3$price_sup_wo
pri3$markup2_log <- log(pri3$markup2)
pri3$markup2_log_lag1 <- lag_group(var=pri3$markup2_log, index=pri3$product, lag=-1)
pri3$markup2_log_delta <- pri3$markup2_log - pri3$markup2_log_lag1


## make coupXautoc and centXautoc vars
pri3$coupXautoc <- pri3$coup*pri3$autoc
pri3$coupXquake1 <- pri3$coup*pri3$quake1
pri3$centXautoc <- pri3$bonw_02_wnind_st*pri3$autoc
pri3$centXquake1 <- pri3$bonw_02_wnind_st*pri3$quake1
pri3$coupXquake <- pri3$coup*pri3$quake
pri3$valueXautoc <- pri3$value_qty_coup*pri3$autoc
pri3$valueXquake <- pri3$value_qty_coup*pri3$quake1


## take out products that aren't imported or don't have data
pri3 <- subset(pri3, pri3$product!="Approvisionnement en eau" & pri3$product!="Charbon de bois")
pri3 <- subset(pri3, pri3$product!="Robe" & pri3$product!="Pantalon pour homme")


## make price change zero if missing and make dummy
pri3$pri_sup_miss <- ifelse(is.na(pri3$price_sup_ha_delta_log)==T, 1, 0)
pri3$price_sup_ha_delta_log_miss <- ifelse(is.na(pri3$price_sup_ha_delta_log)==T, 0, pri3$price_sup_ha_delta_log)
pri3$pri_sup_wo_miss <- ifelse(is.na(pri3$price_sup_wo_delta_log)==T, 1, 0)
pri3$price_sup_wo_delta_log_miss <- ifelse(is.na(pri3$price_sup_wo_delta_log)==T, 0, pri3$price_sup_wo_delta_log)


## make gdelt variables
pri3$usaconflict_log <- log(pri3$usaconflict + 1)
pri3$haitieliteconflict_log <- log(pri3$haitieliteconflict + 1)
pri3$materialconflict_log <- log(pri3$materialconflict)
pri3$noneliteconflict_log <- log(pri3$materialconflict - pri3$haitieliteconflict)



#####
## calculate missingness by product
#####

## missing identified family
t <- data.table(dat)
t$fam_id <- ifelse(is.na(t$fam_1)==T, 0, 1)
t <- t[,lapply(.SD, FUN = sum, na.rm = T), by = c('product', 'fam_id'), .SDcols = c('qty', 'wgt_kg')]
t <- reshape(t, dir = 'wide', idvar = 'product', timevar = 'fam_id')
t$qty.0 <- ifelse(is.na(t$qty.0)==T, 0, t$qty.0)
t$wgt_kg.0 <- ifelse(is.na(t$wgt_kg.0)==T, 0, t$wgt_kg.0)
t$qty_pct_fam_id <- t$qty.1 / (t$qty.1 + t$qty.0)
t$wgt_pct_fam_id <- t$wgt_kg.1 / (t$wgt_kg.1 + t$wgt_kg.0)
t <- subset(t, select = c(product, qty_pct_fam_id, wgt_pct_fam_id))
pri3 <- merge(pri3, t, by = 'product', all.x = T)

## missing identified firm
t <- data.table(dat)
t$firm_id <- ifelse(t$con_final=="NON DISPONIBLE" | t$con_final=="NON DISPONIBLE LAKE CHARLES", 0, 1)
t <- t[,lapply(.SD, FUN = sum, na.rm = T), by = c('product', 'firm_id'), .SDcols = c('qty', 'wgt_kg')]
t <- reshape(t, dir = 'wide', idvar = 'product', timevar = 'firm_id')
t$qty.0 <- ifelse(is.na(t$qty.0)==T, 0, t$qty.0)
t$wgt_kg.0 <- ifelse(is.na(t$wgt_kg.0)==T, 0, t$wgt_kg.0)
t$qty_pct_firm_id <- t$qty.1 / (t$qty.1 + t$qty.0)
t$wgt_pct_firm_id <- t$wgt_kg.1 / (t$wgt_kg.1 + t$wgt_kg.0)
t <- subset(t, select = c(product, qty_pct_firm_id, wgt_pct_firm_id))
pri3 <- merge(pri3, t, by = 'product', all.x = T)



#####
## write to dta
#####

## table of data of businesses and owners for CPI data
write.csv(dat2, '01_Data/02_Clean/fam_con_prod_final.csv')


## final price data
write.dta(pri3, '01_Data/02_Clean/price_final.dta')



